<<<<<<< HEAD ======= <<<<<<< HEAD ======= >>>>>>> 68d422ca6e68caffd5c2857c12f98a28d5ed4997 >>>>>>> 9b9a21e38492699a773761d3c924a5df9d2e013b
# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)
require("maps")
library(geosphere)
library(stringr)
library(rgdal)
library(caret)
<<<<<<< HEAD
library(lubridate)

Attaching package: ‘lubridate’

The following object is masked from ‘package:base’:

    date
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
path.to.csv <- '../Milestone 2/Seattle_Police_Department_911_Incident_Response_Oct17.csv'
=======
library(lubridate)
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
<<<<<<< HEAD
library(ggmap)
path.to.csv <- '../Milestone 2/Seattle_Police_Department_911_Incident_Response_Oct17.csv'
=======
path.to.csv <- '../Year_911_Data.csv'
>>>>>>> 68d422ca6e68caffd5c2857c12f98a28d5ed4997
>>>>>>> 9b9a21e38492699a773761d3c924a5df9d2e013b
spd.911 <- read.csv(path.to.csv, TRUE)
spd.911$clearance_date_ts = as.POSIXct(strptime(spd.911$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
spd.911$clearance_date_date = as.Date(spd.911$clearance_date_ts)
View(spd.911)
# path to the FOLDER with the .shp file in it. the second param is the name of the .shp file
# seattle <- readOGR(dsn = path.expand("~/documents/INFO370/project-teamname-v2/maps-api-test"), layer = "Seattle_City_Limits")
# usa <- map_data("state")
# data <- merge(usa, spd.911)
# Red Square coordinates
here_long <-  -122.3095
here_lat <- 47.6560
seattle = get_map(location = c(here_long, here_lat), zoom = 13, maptype = 'roadmap')
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=47.656,-122.3095&zoom=13&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false
<<<<<<< HEAD

======= <<<<<<< HEAD
spd.911 <- spd.911 %>% 
             rowwise() %>% 
             mutate(dist=distVincentyEllipsoid(c(Longitude, Latitude), c(here_long, here_lat)))              
nrow(spd.911)
[1] 20197
descriptions <- c("STRONG ARM ROBBERY", "PERSON WITH A WEAPON (NOT GUN)", "HAZARDS", "HARRASMENT, THREATS", "FIGHT DISTURBANCE", "CRISIS COMPLAINT - GENERAL", "ARMED ROBBERY")
data.ped <- spd.911 %>% filter(str_detect(Event.Clearance.Description, paste(descriptions, collapse="|")))
# data.ped <- data.now
nrow(data.ped)
[1] 1068
View(data.ped)
data.now <- data.ped %>% filter(clearance_date_ts < '2017-10-31 00:00:00')
nrow(data.now)
[1] 1068
                  
data.here <- data.now %>% filter(dist < 4600)
data <- data.here
nrow(data)
[1] 154
# View(data)
ggmap(seattle) +
   geom_point(data = data, aes(x = Longitude, y = Latitude), colour = "red", alpha = 0.75)

  #coord_map()
=======
>>>>>>> 68d422ca6e68caffd5c2857c12f98a28d5ed4997 >>>>>>> 9b9a21e38492699a773761d3c924a5df9d2e013b
freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)
ggplot(as.data.frame(freq_by_desc), 
       aes(x = Var1, y = Freq)) +
       geom_bar(stat = 'identity') +# create bar plot
<<<<<<< HEAD
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations
=======
<<<<<<< HEAD
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations
=======
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations

>>>>>>> 68d422ca6e68caffd5c2857c12f98a28d5ed4997
>>>>>>> 9b9a21e38492699a773761d3c924a5df9d2e013b
        
ggmap(seattle) +
  geom_point(data = data, aes(x = Longitude, y = Latitude, group = Event.Clearance.Description, color = Event.Clearance.Description), alpha = 0.5) +
  facet_wrap(~ Event.Clearance.Description) +
  theme(axis.ticks = element_blank(), 
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
        )

# selecting just ID and location data
df_loc <- data %>% dplyr::select(CAD.CDW.ID, Longitude, Latitude)
# figuring out number of clusters
wss <- c()
# clusters 1 to 15
for (i in 1:15) {
  wss[i] <- sum(kmeans(df_loc, centers=i)$withinss)
}
plot(1:15, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# fitting model
fit <- kmeans(df_loc, 10)
fit$centers # look at cluster sizes and means. want clusters to be about equal size
   CAD.CDW.ID Longitude Latitude
1     2110534 -122.3090 47.65473
2     2108019 -122.3145 47.65467
3     2114454 -122.3135 47.66288
4     2115894 -122.3117 47.66063
5     2117445 -122.3115 47.66015
6     2105779 -122.3122 47.65821
7     2119756 -122.3128 47.65627
8     2122787 -122.3178 47.66599
9     2112589 -122.3124 47.66454
10    2124681 -122.3089 47.65948
fit$cluster
 [1]  6  6  6  6  6  6  6  6  6  2  2  2  2  2  2  1  1  1  1  1  1  1  9  9  9  9  9  9  9  9  9  9  3  3
[35]  3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5  7  7  7  7  7  7  7  7
[69]  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8 10 10 10 10 10 10 10 10 10
cluster.size <- data.frame(1:10, fit$size)
cluster.size
ggplot(data = cluster.size, aes(x = X1.10, y = fit.size)) +
  geom_bar(stat = 'identity')

ggplot()

ggmap(seattle) +
  geom_point(data = as.data.frame(fit$centers), aes(x = Longitude, y = Latitude), alpha = 0.5)

# looking at cluster means
aggregate(df_loc, by=list(fit$cluster), FUN=mean)
df_loc
# adding data back into dataframe 
# df_loc <- df_loc %>% mutate(cluster = fit$cluster) 
data$cluster <- fit$cluster
# View(data)
# timestamp ->  year  month day hour  minute
# sector -> to factor (there are 17 sectors)
# beat -> to factor (there are 3 beats per sector)
# clean the data a bit more
data$event_clearance_ts = as.POSIXct(strptime(data$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
data$event_clearance_date = as.Date(data$event_clearance_ts)
data$event_clearance_month = month(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_day = weekdays(data$event_clearance_date)
data$event_clearance_hr = hour(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_mn = minute(ymd_hms(as.character(data$event_clearance_ts)))
data$Initial.Type.Group = factor(data$Initial.Type.Group)
data$Event.Clearance.Group = factor(data$Event.Clearance.Group)
data$Zone.Beat = factor(data$Zone.Beat)
data$District.Sector = factor(data$District.Sector)
data$event_clearance_day = factor(data$event_clearance_day)
data
col.names <- paste(c(
  "Event.Clearance.Code"
  , "cluster"
  , "Census.Tract"
  , "event_clearance_day"
  , "Event.Clearance.Group"
  , "Event.Clearance.SubGroup"
  , "District.Sector"
  , "Zone.Beat"
  #, "event_clearance_ts"
  # ,"Incident.Location"
  , "event_clearance_hr"
  , "event_clearance_mn"
  , "event_clearance_month" 
  , "Hundred.Block.Location"
  ), collapse="|")
cols <- grep(col.names, colnames(data))
cols
 [1]  4  6  7  9 10 11 12 23 26 27 28 29
# corr_matrix <- cor(data[,cols]) # correlations between all predictor vars
# corr_matrix
# cutoff <- 0.5 # should be higher in practice
# highly_corr <- findCorrelation(corr_matrix, cutoff=cutoff)
# print(colnames(spd.911)[highly_corr]) # age is highly correalted with pregnant
train.data <- select(data, cols)
train.data
# data <- data %>% droplevels()
# grep("Hundred.Block.Location", colnames(train.data), invert = T)
predictors <- grep("Hundred.Block.Location", colnames(train.data), invert = T)
outcome <- grep("Hundred.Block.Location", colnames(train.data))
# train.data[,predictors]
frame <- data.frame(train.data[,predictors])
frame
out.factor <- train.data$Hundred.Block.Location
as.vector(out.factor)
 [1] "43XX BLOCK OF 15 AV NE"                 "24XX BLOCK OF E LOUISA ST"             
 [3] "48XX BLOCK OF SAND POINT WY NE"         "42XX BLOCK OF UNIVERSITY WY NE"        
 [5] "21XX BLOCK OF N NORTHLAKE WY"           "14XX BLOCK OF NE 43 ST"                
 [7] "14XX BLOCK OF NE 43 ST"                 "17XX BLOCK OF N 45 ST"                 
 [9] "NE 54 ST / 21 AV NE"                    "29XX BLOCK OF FAIRVIEW AV E"           
[11] "SAND POINT WY NE / 40 AV NE"            "55XX BLOCK OF 12 AV NE"                
[13] "NE 43 ST / UNIVERSITY WY NE"            "E ROANOKE ST / I5 NB"                  
[15] "E HAMLIN ST / EASTLAKE AV E"            "23XX BLOCK OF 24 AV E"                 
[17] "50XX BLOCK OF UNIVERSITY WY NE"         "E MONTLAKE PL E / E LAKE WASHINGTON BV"
[19] "14XX BLOCK OF NE 43 ST"                 "16XX BLOCK OF INTERLAKEN PL E"         
[21] "8XX BLOCK OF NE 66 ST"                  "NE 47 ST / 17 AV NE"                   
[23] "45XX BLOCK OF 16 AV NE"                 "16XX BLOCK OF NE 50 ST"                
[25] "45XX BLOCK OF UNIVERSITY WY NE"         "47XX BLOCK OF UNIVERSITY WY NE"        
[27] "NE 52 ST / ROOSEVELT WY NE"             "NE 50 ST / 9 AV NE"                    
[29] "45XX BLOCK OF 25 AV NE"                 "61XX BLOCK OF BROOKLYN AV NE"          
[31] "47XX BLOCK OF UNIVERSITY WY NE"         "45XX BLOCK OF UNIVERSITY WY NE"        
[33] "15 AV NE / NE 55 ST"                    "NE 45 ST / 11 AV NE"                   
[35] "46XX BLOCK OF 27 AV NE"                 "2 AV NE / NE 50 ST"                    
[37] "11 AV NE / NE 42 ST"                    "50XX BLOCK OF 7 AV NE"                 
[39] "NE BLAKELEY ST / 25 AV NE"              "4213 1 / 2 UNIVERSITY WY NE"           
[41] "15 AV NE / NE 42 ST"                    "NE 47 ST / 9 AV NE"                    
[43] "NE 45 ST / 5 AV NE"                     "NE 52 ST / 15 AV NE"                   
[45] "47XX BLOCK OF 30 AV NE"                 "UNIVERSITY WY NE / NE 56 ST"           
[47] "ROOSEVELT WY NE / NE 45 ST"             "50XX BLOCK OF UNIVERSITY WY NE"        
[49] "41XX BLOCK OF UNIVERSITY WY NE"         "45XX BLOCK OF 15 AV NE"                
[51] "27XX BLOCK OF MONTLAKE BV E"            "14XX BLOCK OF N 42 ST"                 
[53] "38XX BLOCK OF 42 AV NE"                 "42XX BLOCK OF UNIVERSITY WY NE"        
[55] "48XX BLOCK OF SAND POINT WY NE"         "24XX BLOCK OF E LOUISA ST"             
[57] "37XX BLOCK OF CORLISS AV N"             "45XX BLOCK OF 18 AV NE"                
[59] "ROOSEVELT WY NE / NE 63 ST"             "NE 50 ST / 2 AV NE"                    
[61] "45XX BLOCK OF 25 AV NE"                 "48XX BLOCK OF SAND POINT WY NE"        
[63] "45XX BLOCK OF 9 AV NE"                  "14XX BLOCK OF N 45 ST"                 
[65] "15 AV NE / NE 50 ST"                    "23XX BLOCK OF MINOR AV E"              
[67] "24 AV E / E MONTLAKE PL E"              "50XX BLOCK OF ROOSEVELT WY NE"         
[69] "8XX BLOCK OF NE 42 ST"                  "40XX BLOCK OF UNIVERSITY WY NE"        
[71] "38XX BLOCK OF NE 57 ST"                 "9XX BLOCK OF E ROANOKE ST"             
[73] "23XX BLOCK OF EASTLAKE AV E"            "BURKE AV N / N NORTHLAKE WY"           
[75] "22XX BLOCK OF NE 51 ST"                 "52XX BLOCK OF 22 AV NE"                
[77] "N 45 ST / BURKE AV N"                   "LATONA AV NE / NE 58 ST"               
[79] "1 AV NE / N 56 ST"                      "ROOSEVELT WY NE / NE 65 ST"            
[81] "43XX BLOCK OF UNIVERSITY WY NE"         "65XX BLOCK OF 18 AV NE"                
[83] "50XX BLOCK OF 19 AV NE"                 "EASTLAKE AV E / E BOSTON ST"           
[85] "45XX BLOCK OF 12 AV NE"                 "15 AV NE / NE 50 ST"                   
[87] "14XX BLOCK OF NE 43 ST"                 "22 AV E / E MILLER ST"                 
[89] "50XX BLOCK OF RAVENNA AV NE"            "52XX BLOCK OF 15 AV NE"                
[91] "55XX BLOCK OF 17 AV NE"                 "39 AV NE / NE 55 ST"                   
control <- rfeControl(functions = rfFuncs, method="cv", number=10)
results <- rfe(frame, out.factor, sizes = c(1:13), rfeControl = control) # this will take AWHILE...
Error in { : task 1 failed - "Can't have empty classes in y."
<<<<<<< HEAD
---
title: "R Notebook"
output: html_notebook
---

```{r setup}
# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)
require("maps")
library(geosphere)
library(stringr)
library(rgdal)
library(caret)
library(lubridate)
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
library(ggmap)
path.to.csv <- '../Milestone 2/Seattle_Police_Department_911_Incident_Response_Oct17.csv'
spd.911 <- read.csv(path.to.csv, TRUE)

spd.911$clearance_date_ts = as.POSIXct(strptime(spd.911$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
spd.911$clearance_date_date = as.Date(spd.911$clearance_date_ts)
View(spd.911)



# path to the FOLDER with the .shp file in it. the second param is the name of the .shp file
# seattle <- readOGR(dsn = path.expand("~/documents/INFO370/project-teamname-v2/maps-api-test"), layer = "Seattle_City_Limits")

# usa <- map_data("state")
# data <- merge(usa, spd.911)
# Red Square coordinates
here_long <-  -122.3095
here_lat <- 47.6560

seattle = get_map(location = c(here_long, here_lat), zoom = 13, maptype = 'roadmap')

```


```{r}


spd.911 <- spd.911 %>% 
             rowwise() %>% 
             mutate(dist=distVincentyEllipsoid(c(Longitude, Latitude), c(here_long, here_lat)))              
nrow(spd.911)

descriptions <- c("STRONG ARM ROBBERY", "PERSON WITH A WEAPON (NOT GUN)", "HAZARDS", "HARASSMENT, THREATS", "FIGHT DISTURBANCE", "CRISIS COMPLAINT - GENERAL", "ARMED ROBBERY")

# Removes Specifically Harassment by Telephone and Writing, as well as other non-scary crimes
data.ped <- spd.911 %>% filter(str_detect(Event.Clearance.Description, paste(descriptions, collapse="|"))) %>% filter(!str_detect(Event.Clearance.Description, "HARASSMENT, THREATS - BY TELEPHONE, WRITING"))
# data.ped <- data.now
nrow(data.ped)

# data.now <- data.ped %>% filter(clearance_date_ts < '2017-10-31 00:00:00')
nrow(data.now)
                  
data.here <- data.now %>% filter(dist < 2600)

data <- read.csv('YearSEAPD.csv', header = TRUE)
nrow(data)
# View(data)

ggmap(seattle) +
   geom_point(data = data, aes(x = Longitude, y = Latitude), colour = "red", alpha = 0.75)
  #coord_map()

```

```{r}
freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)

ggplot(as.data.frame(freq_by_desc), 
       aes(x = Var1, y = Freq)) +
       geom_bar(stat = 'identity') +# create bar plot
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations

        
```

```{r}
ggmap(seattle) +
  geom_point(data = data, aes(x = Longitude, y = Latitude, group = Event.Clearance.Description, color = Event.Clearance.Description), alpha = 0.5) +
  facet_wrap(~ Event.Clearance.Description) +
  theme(axis.ticks = element_blank(), 
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
        )
```

```{r}
# selecting just ID and location data
df_loc <- data %>% dplyr::select(CAD.CDW.ID, Longitude, Latitude)

# figuring out number of clusters
wss <- c()
# clusters 1 to 15
for (i in 1:15) {
  wss[i] <- sum(kmeans(df_loc, centers=i)$withinss)
}
plot(1:15, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# fitting model
fit <- kmeans(df_loc, 10)
fit$centers # look at cluster sizes and means. want clusters to be about equal size
fit$cluster
cluster.size <- data.frame(1:10, fit$size)
cluster.size

ggplot(data = cluster.size, aes(x = X1.10, y = fit.size)) +
  geom_bar(stat = 'identity')
ggplot()
ggmap(seattle) +
  geom_point(data = as.data.frame(fit$centers), aes(x = Longitude, y = Latitude), alpha = 0.5)
# looking at cluster means
aggregate(df_loc, by=list(fit$cluster), FUN=mean)

df_loc

# adding data back into dataframe 
# df_loc <- df_loc %>% mutate(cluster = fit$cluster) 
data$cluster <- fit$cluster

# View(data)
```

```{r}
# timestamp ->  year  month day hour  minute
# sector -> to factor (there are 17 sectors)
# beat -> to factor (there are 3 beats per sector)

# clean the data a bit more
data$event_clearance_ts = as.POSIXct(strptime(data$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
data$event_clearance_date = as.Date(data$event_clearance_ts)
data$event_clearance_month = month(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_day = weekdays(data$event_clearance_date)
data$event_clearance_hr = hour(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_mn = minute(ymd_hms(as.character(data$event_clearance_ts)))
data$Initial.Type.Group = factor(data$Initial.Type.Group)
data$Event.Clearance.Group = factor(data$Event.Clearance.Group)
data$Zone.Beat = factor(data$Zone.Beat)
data$District.Sector = factor(data$District.Sector)
data$event_clearance_day = factor(data$event_clearance_day)
data

col.names <- paste(c(
  "Event.Clearance.Code"
  , "cluster"
  , "Census.Tract"
  , "event_clearance_day"
  , "Event.Clearance.Group"
  , "Event.Clearance.SubGroup"
  , "District.Sector"
  , "Zone.Beat"
  #, "event_clearance_ts"
  # ,"Incident.Location"
  , "event_clearance_hr"
  , "event_clearance_mn"
  , "event_clearance_month" 
  , "Hundred.Block.Location"
  ), collapse="|")
cols <- grep(col.names, colnames(data))
cols
# corr_matrix <- cor(data[,cols]) # correlations between all predictor vars
# corr_matrix

# cutoff <- 0.5 # should be higher in practice

# highly_corr <- findCorrelation(corr_matrix, cutoff=cutoff)
# print(colnames(spd.911)[highly_corr]) # age is highly correalted with pregnant

train.data <- select(data, cols)
train.data
# data <- data %>% droplevels()

# grep("Hundred.Block.Location", colnames(train.data), invert = T)

predictors <- grep("Hundred.Block.Location", colnames(train.data), invert = T)
outcome <- grep("Hundred.Block.Location", colnames(train.data))

# train.data[,predictors]
frame <- data.frame(train.data[,predictors])
frame
out.factor <- train.data$Hundred.Block.Location
as.vector(out.factor)


control <- rfeControl(functions = rfFuncs, method="cv", number=10)
results <- rfe(frame, out.factor, sizes = c(1:13), rfeControl = control) # this will take AWHILE...

results
ggplot(results)

# chosen features
predictors(results)
```

======= <<<<<<< HEAD
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3Igc2V0dXB9CiMgaW5zdGFsbC5wYWNrYWdlcygidGlkeXZlcnNlIik7CiMgaW5zdGFsbC5wYWNrYWdlcygicmdkYWwiKTsKbGlicmFyeSh0aWR5dmVyc2UpCnJlcXVpcmUoIm1hcHMiKQpsaWJyYXJ5KGdlb3NwaGVyZSkKbGlicmFyeShzdHJpbmdyKQpsaWJyYXJ5KHJnZGFsKQpsaWJyYXJ5KGNhcmV0KQpsaWJyYXJ5KGx1YnJpZGF0ZSkKaWYgKCFyZXF1aXJlKGdnbWFwKSkgeyBpbnN0YWxsLnBhY2thZ2VzKCdnZ21hcCcpOyByZXF1aXJlKGdnbWFwKSB9CnBhdGgudG8uY3N2IDwtICcuLi9NaWxlc3RvbmUgMi9TZWF0dGxlX1BvbGljZV9EZXBhcnRtZW50XzkxMV9JbmNpZGVudF9SZXNwb25zZV9PY3QxNy5jc3YnCnNwZC45MTEgPC0gcmVhZC5jc3YocGF0aC50by5jc3YsIFRSVUUpCgpzcGQuOTExJGNsZWFyYW5jZV9kYXRlX3RzID0gYXMuUE9TSVhjdChzdHJwdGltZShzcGQuOTExJEV2ZW50LkNsZWFyYW5jZS5EYXRlLCAiJW0vJWQvJVkgJUk6JU06JVMgJXAiKSkKc3BkLjkxMSRjbGVhcmFuY2VfZGF0ZV9kYXRlID0gYXMuRGF0ZShzcGQuOTExJGNsZWFyYW5jZV9kYXRlX3RzKQojIFZpZXcoc3BkLjkxMSkKCgoKIyBwYXRoIHRvIHRoZSBGT0xERVIgd2l0aCB0aGUgLnNocCBmaWxlIGluIGl0LiB0aGUgc2Vjb25kIHBhcmFtIGlzIHRoZSBuYW1lIG9mIHRoZSAuc2hwIGZpbGUKIyBzZWF0dGxlIDwtIHJlYWRPR1IoZHNuID0gcGF0aC5leHBhbmQoIn4vZG9jdW1lbnRzL0lORk8zNzAvcHJvamVjdC10ZWFtbmFtZS12Mi9tYXBzLWFwaS10ZXN0IiksIGxheWVyID0gIlNlYXR0bGVfQ2l0eV9MaW1pdHMiKQoKIyB1c2EgPC0gbWFwX2RhdGEoInN0YXRlIikKIyBkYXRhIDwtIG1lcmdlKHVzYSwgc3BkLjkxMSkKaGVyZV9sb25nIDwtICAtMTIyLjMwCmhlcmVfbGF0IDwtIDQ3LjY2CgpzZWF0dGxlID0gZ2V0X21hcChsb2NhdGlvbiA9IGMoaGVyZV9sb25nLCBoZXJlX2xhdCksIHpvb20gPSAxMywgbWFwdHlwZSA9ICdyb2FkbWFwJykKCmBgYAoKCmBgYHtyfQpzcGQuOTExIDwtIHNwZC45MTEgJT4lIAogICAgICAgICAgICAgcm93d2lzZSgpICU+JSAKICAgICAgICAgICAgIG11dGF0ZShkaXN0PWRpc3RWaW5jZW50eUVsbGlwc29pZChjKExvbmdpdHVkZSwgTGF0aXR1ZGUpLCBjKGhlcmVfbG9uZywgaGVyZV9sYXQpKSkgICAgICAgICAgICAgIApucm93KHNwZC45MTEpCgpkZXNjcmlwdGlvbnMgPC0gYygiU1RST05HIEFSTSBST0JCRVJZIiwgIlBFUlNPTiBXSVRIIEEgV0VBUE9OIChOT1QgR1VOKSIsICJIQVpBUkRTIiwgIkhBUlJBU01FTlQsIFRIUkVBVFMiLCAiRklHSFQgRElTVFVSQkFOQ0UiLCAiQ1JJU0lTIENPTVBMQUlOVCAtIEdFTkVSQUwiLCAiQVJNRUQgUk9CQkVSWSIpCgpkYXRhLnBlZCA8LSBzcGQuOTExICU+JSBmaWx0ZXIoc3RyX2RldGVjdChFdmVudC5DbGVhcmFuY2UuRGVzY3JpcHRpb24sIHBhc3RlKGRlc2NyaXB0aW9ucywgY29sbGFwc2U9InwiKSkpCiMgZGF0YS5wZWQgPC0gZGF0YS5ub3cKbnJvdyhkYXRhLnBlZCkKVmlldyhkYXRhLnBlZCkKCmRhdGEubm93IDwtIGRhdGEucGVkICU+JSBmaWx0ZXIoY2xlYXJhbmNlX2RhdGVfdHMgPCAnMjAxNy0xMC0zMSAwMDowMDowMCcpCm5yb3coZGF0YS5ub3cpCiAgICAgICAgICAgICAgICAgIApkYXRhLmhlcmUgPC0gZGF0YS5ub3cgJT4lIGZpbHRlcihkaXN0IDwgNDYwMCkKCmRhdGEgPC0gZGF0YS5oZXJlCm5yb3coZGF0YSkKIyBWaWV3KGRhdGEpCgpnZ21hcChzZWF0dGxlKSArCiAgIGdlb21fcG9pbnQoZGF0YSA9IGRhdGEsIGFlcyh4ID0gTG9uZ2l0dWRlLCB5ID0gTGF0aXR1ZGUpLCBjb2xvdXIgPSAicmVkIiwgYWxwaGEgPSAwLjc1KQogICNjb29yZF9tYXAoKQoKYGBgCgpgYGB7cn0KZnJlcV9ieV9kZXNjIDwtIHRhYmxlKGRyb3BsZXZlbHMoZGF0YSRFdmVudC5DbGVhcmFuY2UuRGVzY3JpcHRpb24pKQojIFZpZXcoZnJlcV9ieV9kZXNjKQoKZ2dwbG90KGFzLmRhdGEuZnJhbWUoZnJlcV9ieV9kZXNjKSwgCiAgICAgICBhZXMoeCA9IFZhcjEsIHkgPSBGcmVxKSkgKwogICAgICAgZ2VvbV9iYXIoc3RhdCA9ICdpZGVudGl0eScpICsjIGNyZWF0ZSBiYXIgcGxvdAogICAgY29vcmRfZmxpcCgpCgojVHJhZmZpYyByZWxhdGVkIGNhbGxzLCBzdXNwaWNpb3VzIGNpcmN1bXN0YW5jZXMsIGFuZCBkaXN0dXJiYW5jZXMgYXJlIHRoZSB0aGUgbW9zdCBzaWduaWZpY2FudCB0aHJlYXRzIHRvIHBlZGVzdHJhdGlvbnMKCiAgICAgICAgCmBgYAoKYGBge3J9CmdnbWFwKHNlYXR0bGUpICsKICBnZW9tX3BvaW50KGRhdGEgPSBkYXRhLCBhZXMoeCA9IExvbmdpdHVkZSwgeSA9IExhdGl0dWRlLCBncm91cCA9IEV2ZW50LkNsZWFyYW5jZS5EZXNjcmlwdGlvbiwgY29sb3IgPSBFdmVudC5DbGVhcmFuY2UuRGVzY3JpcHRpb24pLCBhbHBoYSA9IDAuNSkgKwogIGZhY2V0X3dyYXAofiBFdmVudC5DbGVhcmFuY2UuRGVzY3JpcHRpb24pICsKICB0aGVtZShheGlzLnRpY2tzID0gZWxlbWVudF9ibGFuaygpLCAKICAgICAgICBheGlzLnRleHQueCA9IGVsZW1lbnRfYmxhbmsoKSwKICAgICAgICBheGlzLnRleHQueSA9IGVsZW1lbnRfYmxhbmsoKSwKICAgICAgICBsZWdlbmQucG9zaXRpb24gPSAibm9uZSIKICAgICAgICApCmBgYAoKYGBge3J9CiMgc2VsZWN0aW5nIGp1c3QgSUQgYW5kIGxvY2F0aW9uIGRhdGEKZGZfbG9jIDwtIGRhdGEgJT4lIGRwbHlyOjpzZWxlY3QoQ0FELkNEVy5JRCwgTG9uZ2l0dWRlLCBMYXRpdHVkZSkKCiMgZmlndXJpbmcgb3V0IG51bWJlciBvZiBjbHVzdGVycwp3c3MgPC0gYygpCiMgY2x1c3RlcnMgMSB0byAxNQpmb3IgKGkgaW4gMToxNSkgewogIHdzc1tpXSA8LSBzdW0oa21lYW5zKGRmX2xvYywgY2VudGVycz1pKSR3aXRoaW5zcykKfQpwbG90KDE6MTUsIHdzcywgdHlwZT0iYiIsIHhsYWI9Ik51bWJlciBvZiBDbHVzdGVycyIsCiAgeWxhYj0iV2l0aGluIGdyb3VwcyBzdW0gb2Ygc3F1YXJlcyIpCgojIGZpdHRpbmcgbW9kZWwKZml0IDwtIGttZWFucyhkZl9sb2MsIDEwKQpmaXQkY2VudGVycyAjIGxvb2sgYXQgY2x1c3RlciBzaXplcyBhbmQgbWVhbnMuIHdhbnQgY2x1c3RlcnMgdG8gYmUgYWJvdXQgZXF1YWwgc2l6ZQpmaXQkY2x1c3RlcgpjbHVzdGVyLnNpemUgPC0gZGF0YS5mcmFtZSgxOjEwLCBmaXQkc2l6ZSkKY2x1c3Rlci5zaXplCgpnZ3Bsb3QoZGF0YSA9IGNsdXN0ZXIuc2l6ZSwgYWVzKHggPSBYMS4xMCwgeSA9IGZpdC5zaXplKSkgKwogIGdlb21fYmFyKHN0YXQgPSAnaWRlbnRpdHknKQpnZ3Bsb3QoKQpnZ21hcChzZWF0dGxlKSArCiAgZ2VvbV9wb2ludChkYXRhID0gYXMuZGF0YS5mcmFtZShmaXQkY2VudGVycyksIGFlcyh4ID0gTG9uZ2l0dWRlLCB5ID0gTGF0aXR1ZGUpLCBhbHBoYSA9IDAuNSkKIyBsb29raW5nIGF0IGNsdXN0ZXIgbWVhbnMKYWdncmVnYXRlKGRmX2xvYywgYnk9bGlzdChmaXQkY2x1c3RlciksIEZVTj1tZWFuKQoKZGZfbG9jCgojIGFkZGluZyBkYXRhIGJhY2sgaW50byBkYXRhZnJhbWUgCiMgZGZfbG9jIDwtIGRmX2xvYyAlPiUgbXV0YXRlKGNsdXN0ZXIgPSBmaXQkY2x1c3RlcikgCmRhdGEkY2x1c3RlciA8LSBmaXQkY2x1c3RlcgoKIyBWaWV3KGRhdGEpCmBgYAoKYGBge3J9CiMgdGltZXN0YW1wIC0+ICB5ZWFyICBtb250aCBkYXkgaG91ciAgbWludXRlCiMgc2VjdG9yIC0+IHRvIGZhY3RvciAodGhlcmUgYXJlIDE3IHNlY3RvcnMpCiMgYmVhdCAtPiB0byBmYWN0b3IgKHRoZXJlIGFyZSAzIGJlYXRzIHBlciBzZWN0b3IpCgojIGNsZWFuIHRoZSBkYXRhIGEgYml0IG1vcmUKZGF0YSRldmVudF9jbGVhcmFuY2VfdHMgPSBhcy5QT1NJWGN0KHN0cnB0aW1lKGRhdGEkRXZlbnQuQ2xlYXJhbmNlLkRhdGUsICIlbS8lZC8lWSAlSTolTTolUyAlcCIpKQpkYXRhJGV2ZW50X2NsZWFyYW5jZV9kYXRlID0gYXMuRGF0ZShkYXRhJGV2ZW50X2NsZWFyYW5jZV90cykKZGF0YSRldmVudF9jbGVhcmFuY2VfbW9udGggPSBtb250aCh5bWRfaG1zKGFzLmNoYXJhY3RlcihkYXRhJGV2ZW50X2NsZWFyYW5jZV90cykpKQpkYXRhJGV2ZW50X2NsZWFyYW5jZV9kYXkgPSB3ZWVrZGF5cyhkYXRhJGV2ZW50X2NsZWFyYW5jZV9kYXRlKQpkYXRhJGV2ZW50X2NsZWFyYW5jZV9ociA9IGhvdXIoeW1kX2htcyhhcy5jaGFyYWN0ZXIoZGF0YSRldmVudF9jbGVhcmFuY2VfdHMpKSkKZGF0YSRldmVudF9jbGVhcmFuY2VfbW4gPSBtaW51dGUoeW1kX2htcyhhcy5jaGFyYWN0ZXIoZGF0YSRldmVudF9jbGVhcmFuY2VfdHMpKSkKZGF0YSRJbml0aWFsLlR5cGUuR3JvdXAgPSBmYWN0b3IoZGF0YSRJbml0aWFsLlR5cGUuR3JvdXApCmRhdGEkRXZlbnQuQ2xlYXJhbmNlLkdyb3VwID0gZmFjdG9yKGRhdGEkRXZlbnQuQ2xlYXJhbmNlLkdyb3VwKQpkYXRhJFpvbmUuQmVhdCA9IGZhY3RvcihkYXRhJFpvbmUuQmVhdCkKZGF0YSREaXN0cmljdC5TZWN0b3IgPSBmYWN0b3IoZGF0YSREaXN0cmljdC5TZWN0b3IpCmRhdGEkZXZlbnRfY2xlYXJhbmNlX2RheSA9IGZhY3RvcihkYXRhJGV2ZW50X2NsZWFyYW5jZV9kYXkpCmRhdGEKCmNvbC5uYW1lcyA8LSBwYXN0ZShjKAogICJFdmVudC5DbGVhcmFuY2UuQ29kZSIKICAsICJjbHVzdGVyIgogICwgIkNlbnN1cy5UcmFjdCIKICAsICJldmVudF9jbGVhcmFuY2VfZGF5IgogICwgIkV2ZW50LkNsZWFyYW5jZS5Hcm91cCIKICAsICJFdmVudC5DbGVhcmFuY2UuU3ViR3JvdXAiCiAgLCAiRGlzdHJpY3QuU2VjdG9yIgogICwgIlpvbmUuQmVhdCIKICAjLCAiZXZlbnRfY2xlYXJhbmNlX3RzIgogICMgLCJJbmNpZGVudC5Mb2NhdGlvbiIKICAsICJldmVudF9jbGVhcmFuY2VfaHIiCiAgLCAiZXZlbnRfY2xlYXJhbmNlX21uIgogICwgImV2ZW50X2NsZWFyYW5jZV9tb250aCIgCiAgLCAiSHVuZHJlZC5CbG9jay5Mb2NhdGlvbiIKICApLCBjb2xsYXBzZT0ifCIpCmNvbHMgPC0gZ3JlcChjb2wubmFtZXMsIGNvbG5hbWVzKGRhdGEpKQpjb2xzCiMgY29ycl9tYXRyaXggPC0gY29yKGRhdGFbLGNvbHNdKSAjIGNvcnJlbGF0aW9ucyBiZXR3ZWVuIGFsbCBwcmVkaWN0b3IgdmFycwojIGNvcnJfbWF0cml4CgojIGN1dG9mZiA8LSAwLjUgIyBzaG91bGQgYmUgaGlnaGVyIGluIHByYWN0aWNlCgojIGhpZ2hseV9jb3JyIDwtIGZpbmRDb3JyZWxhdGlvbihjb3JyX21hdHJpeCwgY3V0b2ZmPWN1dG9mZikKIyBwcmludChjb2xuYW1lcyhzcGQuOTExKVtoaWdobHlfY29ycl0pICMgYWdlIGlzIGhpZ2hseSBjb3JyZWFsdGVkIHdpdGggcHJlZ25hbnQKCnRyYWluLmRhdGEgPC0gc2VsZWN0KGRhdGEsIGNvbHMpCnRyYWluLmRhdGEKIyBkYXRhIDwtIGRhdGEgJT4lIGRyb3BsZXZlbHMoKQoKIyBncmVwKCJIdW5kcmVkLkJsb2NrLkxvY2F0aW9uIiwgY29sbmFtZXModHJhaW4uZGF0YSksIGludmVydCA9IFQpCgpwcmVkaWN0b3JzIDwtIGdyZXAoIkh1bmRyZWQuQmxvY2suTG9jYXRpb24iLCBjb2xuYW1lcyh0cmFpbi5kYXRhKSwgaW52ZXJ0ID0gVCkKb3V0Y29tZSA8LSBncmVwKCJIdW5kcmVkLkJsb2NrLkxvY2F0aW9uIiwgY29sbmFtZXModHJhaW4uZGF0YSkpCgojIHRyYWluLmRhdGFbLHByZWRpY3RvcnNdCmZyYW1lIDwtIGRhdGEuZnJhbWUodHJhaW4uZGF0YVsscHJlZGljdG9yc10pCmZyYW1lCm91dC5mYWN0b3IgPC0gdHJhaW4uZGF0YSRIdW5kcmVkLkJsb2NrLkxvY2F0aW9uCmFzLnZlY3RvcihvdXQuZmFjdG9yKQoKCmNvbnRyb2wgPC0gcmZlQ29udHJvbChmdW5jdGlvbnMgPSByZkZ1bmNzLCBtZXRob2Q9ImN2IiwgbnVtYmVyPTEwKQpyZXN1bHRzIDwtIHJmZShmcmFtZSwgb3V0LmZhY3Rvciwgc2l6ZXMgPSBjKDE6MTMpLCByZmVDb250cm9sID0gY29udHJvbCkgIyB0aGlzIHdpbGwgdGFrZSBBV0hJTEUuLi4KCnJlc3VsdHMKZ2dwbG90KHJlc3VsdHMpCgojIGNob3NlbiBmZWF0dXJlcwpwcmVkaWN0b3JzKHJlc3VsdHMpCmBgYAo=
=======
---
title: "R Notebook"
output: html_notebook
---

```{r setup}
# install.packages("tidyverse");
# install.packages("rgdal");
library(tidyverse)
require("maps")
library(geosphere)
library(stringr)
library(rgdal)
library(caret)
library(lubridate)
if (!require(ggmap)) { install.packages('ggmap'); require(ggmap) }
path.to.csv <- '../Year_911_Data.csv'
spd.911 <- read.csv(path.to.csv, TRUE)

spd.911$clearance_date_ts = as.POSIXct(strptime(spd.911$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
spd.911$clearance_date_date = as.Date(spd.911$clearance_date_ts)
# View(spd.911)



# path to the FOLDER with the .shp file in it. the second param is the name of the .shp file
# seattle <- readOGR(dsn = path.expand("~/documents/INFO370/project-teamname-v2/maps-api-test"), layer = "Seattle_City_Limits")

# usa <- map_data("state")
# data <- merge(usa, spd.911)
# Red Square coordinates
here_long <-  -122.3095
here_lat <- 47.6560

seattle = get_map(location = c(here_long, here_lat), zoom = 13, maptype = 'roadmap')

```


```{r}
spd.911 <- spd.911 %>% 
             rowwise() %>% 
             mutate(dist=distVincentyEllipsoid(c(Longitude, Latitude), c(here_long, here_lat)))              
nrow(spd.911)

descriptions <- c("STRONG ARM ROBBERY", "PERSON WITH A WEAPON (NOT GUN)", "HAZARDS", "HARASSMENT, THREATS", "FIGHT DISTURBANCE", "CRISIS COMPLAINT - GENERAL", "ARMED ROBBERY")

# Removes Specifically Harassment by Telephone and Writing, as well as other non-scary crimes
data.ped <- spd.911 %>% filter(str_detect(Event.Clearance.Description, paste(descriptions, collapse="|"))) %>% filter(!str_detect(Event.Clearance.Description, "HARASSMENT, THREATS - BY TELEPHONE, WRITING"))
# data.ped <- data.now
nrow(data.ped)

data.now <- data.ped %>% filter(clearance_date_ts < '2017-10-31 00:00:00')
nrow(data.now)
                  
data.here <- data.now %>% filter(dist < 4600)

data <- data.here
nrow(data)
# View(data)

ggmap(seattle) +
   geom_point(data = data, aes(x = Longitude, y = Latitude), colour = "red", alpha = 0.75)
  #coord_map()

```

```{r}
freq_by_desc <- table(droplevels(data$Event.Clearance.Description))
# View(freq_by_desc)

ggplot(as.data.frame(freq_by_desc), 
       aes(x = Var1, y = Freq)) +
       geom_bar(stat = 'identity') +# create bar plot
    coord_flip()

#Traffic related calls, suspicious circumstances, and disturbances are the the most significant threats to pedestrations

        
```

```{r}
ggmap(seattle) +
  geom_point(data = data, aes(x = Longitude, y = Latitude, group = Event.Clearance.Description, color = Event.Clearance.Description), alpha = 0.5) +
  facet_wrap(~ Event.Clearance.Description) +
  theme(axis.ticks = element_blank(), 
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
        )
```

```{r}
# selecting just ID and location data
df_loc <- data %>% dplyr::select(CAD.CDW.ID, Longitude, Latitude)

# figuring out number of clusters
wss <- c()
# clusters 1 to 15
for (i in 1:15) {
  wss[i] <- sum(kmeans(df_loc, centers=i)$withinss)
}
plot(1:15, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# fitting model
fit <- kmeans(df_loc, 10)
fit$centers # look at cluster sizes and means. want clusters to be about equal size
fit$cluster
cluster.size <- data.frame(1:10, fit$size)
cluster.size

ggplot(data = cluster.size, aes(x = X1.10, y = fit.size)) +
  geom_bar(stat = 'identity')
ggplot()
ggmap(seattle) +
  geom_point(data = as.data.frame(fit$centers), aes(x = Longitude, y = Latitude), alpha = 0.5)
# looking at cluster means
aggregate(df_loc, by=list(fit$cluster), FUN=mean)

df_loc

# adding data back into dataframe 
# df_loc <- df_loc %>% mutate(cluster = fit$cluster) 
data$cluster <- fit$cluster

# View(data)
```

```{r}
# timestamp ->  year  month day hour  minute
# sector -> to factor (there are 17 sectors)
# beat -> to factor (there are 3 beats per sector)

# clean the data a bit more
data$event_clearance_ts = as.POSIXct(strptime(data$Event.Clearance.Date, "%m/%d/%Y %I:%M:%S %p"))
data$event_clearance_date = as.Date(data$event_clearance_ts)
data$event_clearance_month = month(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_day = weekdays(data$event_clearance_date)
data$event_clearance_hr = hour(ymd_hms(as.character(data$event_clearance_ts)))
data$event_clearance_mn = minute(ymd_hms(as.character(data$event_clearance_ts)))
data$Initial.Type.Group = factor(data$Initial.Type.Group)
data$Event.Clearance.Group = factor(data$Event.Clearance.Group)
data$Zone.Beat = factor(data$Zone.Beat)
data$District.Sector = factor(data$District.Sector)
data$event_clearance_day = factor(data$event_clearance_day)
data

col.names <- paste(c(
  "Event.Clearance.Code"
  , "cluster"
  , "Census.Tract"
  , "event_clearance_day"
  , "Event.Clearance.Group"
  , "Event.Clearance.SubGroup"
  , "District.Sector"
  , "Zone.Beat"
  #, "event_clearance_ts"
  # ,"Incident.Location"
  , "event_clearance_hr"
  , "event_clearance_mn"
  , "event_clearance_month" 
  , "Hundred.Block.Location"
  ), collapse="|")
cols <- grep(col.names, colnames(data))
cols
# corr_matrix <- cor(data[,cols]) # correlations between all predictor vars
# corr_matrix

# cutoff <- 0.5 # should be higher in practice

# highly_corr <- findCorrelation(corr_matrix, cutoff=cutoff)
# print(colnames(spd.911)[highly_corr]) # age is highly correalted with pregnant

train.data <- select(data, cols)
train.data
# data <- data %>% droplevels()

# grep("Hundred.Block.Location", colnames(train.data), invert = T)

predictors <- grep("Hundred.Block.Location", colnames(train.data), invert = T)
outcome <- grep("Hundred.Block.Location", colnames(train.data))

# train.data[,predictors]
frame <- data.frame(train.data[,predictors])
frame
out.factor <- train.data$Hundred.Block.Location
as.vector(out.factor)


control <- rfeControl(functions = rfFuncs, method="cv", number=10)
results <- rfe(frame, out.factor, sizes = c(1:13), rfeControl = control) # this will take AWHILE...

results
ggplot(results)

# chosen features
predictors(results)
```

>>>>>>> 68d422ca6e68caffd5c2857c12f98a28d5ed4997 >>>>>>> 9b9a21e38492699a773761d3c924a5df9d2e013b